RNA-seq analysis workflow
Code Backup
Preparation
Sample data
Data import (count matrix).
Data import (TPM).
TPM calculation manually
To get GTF file, visit the following link :
https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/
Human
https://hgdownload.soe.ucsc.edu/goldenPath/mm10/bigZips/genes/
Mouse
library(rtracklayer)
library(GenomicFeatures)
library(dplyr)
dir = "path/ref/"
gtf_path <- paste0(dir,"hg38_gtf_ucsc/hg38.refGene.gtf")
gtf <- rtracklayer::import(gtf_path)
txdb <- makeTxDbFromGFF(gtf_path, format="gtf")
# Gene information (start and end)
genes <- genes(txdb)
# Create data frame
gene_lengths <- with(as.data.frame(genes), {
gene_id = mcols(genes)$gene_id
start = start
end = end
width = end - start + 1
data.frame(gene_id, start, end, width)
})
# head(gene_lengths)
# gene_lengths %>% dim()
# Check the data manually
gs = c("TACSTD2","EGFR")
gene_lengths[gene_lengths$gene_id %in% gs,]
# Save output
gene_lengths %>% write.csv(paste0(dir,"hg38_refGene_geneLength.csv"))Calculate TPM sample by sample
Principal component analysis
ggplot(pcaData, aes(x = PC1, y = PC2, fill = group)) +
geom_point(size = 4, alpha = 0.6, shape = 21, color = "black", stroke = 0.5) +
ggrepel::geom_text_repel(aes(label=name),
color="grey6", size=3, hjust= -0.3, vjust=-0.3) +
labs(x = paste("PC1: ", round(100 * PCA_var[1]), "% variance"),
y = paste("PC2: ", round(100 * PCA_var[2]), "% variance")) +
theme_bw() +
theme(legend.title = element_blank()) +
ggtitle("PCA") +
labs(caption = " ")Differentially Expressed Genes (DEG) analysis
DEG data
Visualization of DEGs by Volcanoplot
Traditional version
res = res %>% mutate(DE=ifelse(log2FoldChange >= log2(fc) & padj < pval, 'UP',
ifelse(log2FoldChange <= -log2(fc) & padj < pval, 'DN','no_sig')))
res$DE = factor(res$DE, levels = c('UP','DN','no_sig'))
res %>%
ggplot(aes(log2FoldChange, -log10(padj), color=DE)) +
geom_point(size=1, alpha=0.5) +
scale_color_manual(values = c('red','blue','grey')) +
theme_classic() +
geom_vline(xintercept = c(-log2(fc),log2(fc)), color='grey') +
geom_hline(yintercept = -log10(0.05),color='grey') +
guides(colour = guide_legend(override.aes = list(size=5))) +
ggtitle("Resist/Naive") +
ggeasy::easy_center_title() ## to center titleVolcanoplot (other versions)
Three specific gene groups : black, red, blue
Dots in the plot are colored by the groups.
Axis label represents mathematical expressions.
Prepare data
library(cowplot)
library(dplyr)
library(ggplot2)
library(DESeq2)
dir= '~/Desktop/DF/DFCI_Barbie/DFCI_Barbie_NK_Marco/'
res = read.csv(paste0(dir,"info/published_RNA_seq/DEG_Deseq2_removedSamples_Low_vs_High.csv"), check.names = F)
# Prepare data
res$color = 'not_selected'
gene_vector <- c("HLA-A", "HLA-B", "B2M", "TAP1", "AXL","CXCL11", "CXCL10", "CXCL9", "CCL5", "CCL2")
gene_vector= gene_vector[gene_vector %in% res$Gene]
res[res$Gene %in% gene_vector,]$color = "selected"
res$label = ''
res[res$Gene %in% gene_vector,]$label = res[res$Gene %in% gene_vector,]$Gene# Define gene categories with specific biological functions
black <- c("HLA-A", "HLA-B", "B2M", "TAP1", "CXCL9", "CXCL10", "CXCL11", "CCL2", "CCL5")
black <- black[black %in% res$Gene]
red <- c("IL2", "IL12A", "IL15", "IL18", "IL21", "IL23A", "IFNA", "IFNB")
red <- red[red %in% res$Gene]
blue <- c("TGFB1", "IL10", "IL6")
blue <- blue[blue %in% res$Gene]
# Initialize a label column in the dataset
res$label <- ''
# Update label column to contain gene names for specified categories
res[res$Gene %in% union(black, union(red, blue)),]$label <- res[res$Gene %in% union(black, union(red, blue)),]$Gene
# Assign colors to genes based on their categories
res <- res %>% mutate(label_color = ifelse(Gene %in% black, "black",
ifelse(Gene %in% red, "red",
ifelse(Gene %in% blue, "blue", "other"))))
# Convert label_color to a factor with specific level order
res$label_color <- factor(res$label_color, levels = c("black", "red", "blue", "other"))Volcanoplot 1
res %>%
ggplot(aes(-log2FoldChange, -log10(padj), color = label_color, alpha = label_color)) +
geom_point(data = . %>% filter(!label_color %in% c("black", "red", "blue")),
size = 1, alpha = 0.3, show.legend = F) +
geom_point(data = . %>% filter(label_color %in% c("black", "red", "blue")),
size = 1, show.legend = F) +
scale_color_manual(values = c("grey","black","red","blue")) +
scale_alpha_manual(values = c(1, 1, 1, 0.3)) +
ggrepel::geom_text_repel(data = . %>% filter(label_color %in% c("black", "red", "blue")),
aes(label = label, color = label_color),
size = 4, max.overlaps = Inf, show.legend = FALSE) +
theme_classic() +
xlab(expression(Log[2]*" (Fold Change)")) +
ylab(expression(-Log[10]*" (padj)")) +
theme(axis.text = element_text(size = 14)) Volcanoplot 2
With guidelines for significance in the plot and x axis limit
# Plotting
res %>%
ggplot(aes(-log2FoldChange, -log10(padj),
color = label_color, alpha = label_color)) +
geom_point(data = . %>% filter(!label_color %in% c("black", "red", "blue")),
size = 1, alpha = 0.3, show.legend = FALSE) +
# Plot highlighted genes
geom_point(data = . %>% filter(label_color %in% c("black", "red", "blue")),
size = 1, show.legend = FALSE) +
scale_color_manual(values = c("grey", "black", "red", "blue")) +
# Set the alpha values for points
scale_alpha_manual(values = c(1, 1, 1, 0.3)) +
# Add labels to highlighted genes using ggrepel to avoid overlap
ggrepel::geom_text_repel(data = . %>% filter(label_color %in% c("black", "red", "blue")),
aes(label = label, color = label_color),
size = 4, max.overlaps = Inf, show.legend = FALSE) +
theme_classic() +
xlab(expression(Log[2]*" (Fold Change)")) +
ylab(expression(-Log[10]*" (padj)")) +
theme(axis.text = element_text(size = 14)) +
# Define limits and breaks for the x-axis
xlim(c(-11, 11)) +
scale_x_continuous(breaks = seq(-10, 10, by = 2), limits = c(-10, 11)) +
# Add vertical and horizontal dashed lines at significant thresholds
geom_vline(xintercept = c(-log2(1.5), log2(1.5)), color="grey", linetype = "dashed") +
geom_hline(yintercept = -log10(0.05), color="grey", linetype = "dashed")Gene set enrichment analysis
Perform GSEA
library(clusterProfiler)
gobp <- msigdbr::msigdbr(species = "Mus musculus",
category = "C5", subcategory = "GO:BP") %>%
dplyr::select(gs_name, gene_symbol)
perform_GSEA <- function(res, ref, pvalueCutoff = 1) {
ranking <- function(res) {
df <- res$avg_log2FC
names(df) <- rownames(res)
df <- sort(df, decreasing = TRUE)
return(df)
}
ranked.res <- ranking(res)
x <- clusterProfiler::GSEA(geneList = ranked.res,
TERM2GENE = ref,
pvalueCutoff = pvalueCutoff,
pAdjustMethod = "BH",
verbose = TRUE,
seed = TRUE)
result <- x@result %>% arrange(desc(NES))
result <- result[, c('NES', 'pvalue', 'p.adjust', 'core_enrichment', 'ID')]
return(result)
}
# Application
gsea.res = perform_GSEA(res = deg.cluster3, ref = gpbp)NES plot
# GESA Plot
gsea_nes_plot =function(gsea.res, title){
gsea.res %>% ggplot(aes(reorder(ID, NES), NES)) +
geom_col(aes(fill=p.adjust)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title= "GSEA") +
theme_classic() +
scale_fill_gradient(low = 'red', high = '#E5E7E9') +
theme(axis.text.x= element_text(size=5, face = 'bold'),
axis.text.y= element_text(size=6, face = 'bold'),
axis.title =element_text(size=10)) +ggtitle(title)
}
# Application
gsea_nes_plot(gsea.res[gsea.res$p.adjust <= 0.05,], title = "Cluster3")Enrichment plot for the individual pathway (gseaplot2)
ssGSEA
Single sample GSEA
Single-sample Gene Set Enrichment Analysis (ssGSEA) is an variation
of the GSEA algorithm that instead of calculating enrichment scores for
groups of samples (i.e Control vs Disease) and sets of genes (i.e
pathways), it provides a score for each each sample and gene set
pair.
(https://www.genepattern.org/modules/docs/ssGSEAProjection/4#gsc.tab=0)
## Perform ssgsea
library(corto)
## Input data : tpm (count.mtx is accepted as well)
test = ssgsea(tpm,hallmarkList)
## Reshape it to plot
test.df = test %>% t() %>% data.frame()
rownames(test.df) = colnames(tpm)
## p value of ssgsea
pval = corto::z2p(test)
colnames(pval) = rownames(test.df)ssGSEA heatmap
Color represents enrichment score
## Heatmap
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(60),
colorRampPalette(colors = c("white","#D35400"))(70))
t(test.df) %>% pheatmap::pheatmap(color = my.color,
cluster_rows = F,
cluster_cols = F,
main = "ssGSEA of HALLMARK pathways",
gaps_col = c(3,6)) # Simple heatmap Significance by p value Heatmap
Red: p value <= 0.05 (significant)
White : p value > 0.05 (not significant)
## Heatmap
input.data = (pval<=0.05)
input.data[] <- +input.data
input.data %>% pheatmap::pheatmap(color = c("grey99","salmon"),
cluster_rows = F,
cluster_cols = F,
main = "Significance by p value",
gaps_col = c(3,6)) # Simple heatmap Single pathway ssGSEA
Pathway sample 1
# Plot it
genesetname = "WP Apoptosis"
test.df = test.df %>%
mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
colorRampPalette(colors = c("white","#D35400"))(40))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F,
color=my.color, border_color = "grey33",
gaps_col = 1,
main = "ssGSEA with significance")Pathway sample 2
# Plot it
genesetname = "GPBP DNA Damage"
test.df = test.df %>%
mutate(significance = ifelse(TCSA_pval <= 0.05, max(test.df[,1]),0))
input = data.frame(test.df[,c(1,3)])
rownames(input) = rownames(test.df)
colnames(input) = c(genesetname, "p value < = 0.05")
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(50),
colorRampPalette(colors = c("white","#D35400"))(60))
input %>% pheatmap::pheatmap(cluster_rows = F, cluster_cols = F,
color=my.color, border_color = "grey33",
gaps_col = 1,
main = "ssGSEA with significance")Multiple genes-violin plot across samples
Define Function
# Import sample data
tpm.sample = read.csv("../sampledata/tpm.sample.csv", row.names = 1)
cellmarker = read.csv('../info/DB/CellMarker2.0.csv')
celltypes = c("B cell",
"CD8+ T cell",
"Activated T cell",
"Cancer cell",
"CD4+ T cell",
"Dendritic cell",
"Macrophage")
# Functions
mks.violinplot = function(cellname, tpm=tpm.sample){
rs <- grepl(cellname, cellmarker$cell_name, ignore.case = TRUE)
mks = cellmarker[rs, ]$marker
mks = mks[mks %in% rownames(tpm)]
tpm.df =tpm %>% filter(rownames(.) %in% mks)
tpm.df$gene = rownames(tpm.df)
p= tpm.df %>% reshape::melt() %>% ggplot(aes(.[,2], log10(.[,3]))) +
geom_violin(aes(fill = variable), alpha = 0.1, show.legend = FALSE) +
geom_boxplot(outlier.size = 0, width = 0.2, alpha = 0.5, show.legend = FALSE) +
geom_jitter(aes(color = variable), alpha = 0.5, size = 1, show.legend = FALSE) +
scale_y_continuous(labels = scales::comma) +
theme_bw() +
labs(y = paste0(cellname, " genes (log10 scaled)"),
title = paste0(cellname, " signature genes ")) +
theme(
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust=1, size=14),
panel.grid.major.x = element_blank(),
axis.title.y = element_text(size = 14), # y 축 레이블 폰트 크기 설정
plot.title = element_text(size = 20)
)
return(p)
}Correlation calculation
Method: spearman
pearson is available
Define Function
# Read CPDM PDX RNAseq TPM value
dir = "~/Desktop/DF/DFCI_Paweletz/2023_RNA_seq_PDX/"
count.mtx = read.csv(paste0(dir,"count_matrix/CPDM_PDX_RNAseq_DNAnexus/CPDM_PDX_RNAseq_count.mtx.24.04.07.csv"), row.names = 1)
tpm = read.csv(paste0(dir, "count_matrix/CPDM_PDX_RNAseq_DNAnexus/CPDM_PDX_RNAseq_tpm.24.04.07.csv"), row.names = 1)
# Function
# Spearman correlation to input gene
gene_cor_spearman = function(gene, tpm_data=tpm){
input_exp <- t(tpm_data[gene, , drop = FALSE]) # 전치하여 샘플이 열이 되게 함
# 다른 유전자 데이터 확인
other_genes <- t(tpm_data) # 전치하여 샘플이 열이 되게 함
# 스피어만 상관 계수 및 p-value 계산
results <- WGCNA::corAndPvalue(input_exp, other_genes, use = "pairwise.complete.obs", method = "spearman")
df = results$cor %>% t() %>% data.frame()
df2 = results$p %>% t() %>% data.frame()
df = cbind(df,df2)
colnames(df)[2] = "pvalue"
return(df)
}Correlation heatmap
Define Function
# Prepare data
dir <- "~/Desktop/figure/"
# Input data
adc = "ADC_gene_set.plus.CPDM.tpm.csv"
# tcsa = "TCSA_gene_set.KRAS.CPDM.tpm.csv"
tpm.adc = read.csv(paste0("~/Desktop/figure/",adc), row.names = 1)
# tpm.tcsa = read.csv(paste0("~/Desktop/figure/",tcsa), row.names = 1)
# Define Functions
# Function 1
correlation_df = function(input_exp){
input_exp = t(input_exp)
# 스피어만 상관 계수 및 p-value 계산
results <- WGCNA::cor(input_exp, use = "pairwise.complete.obs", method = "spearman")
df = results %>% data.frame() # spearman correlation
return(df)
}
# Application
input.adc = correlation_df(input_exp = tpm.adc)
#input.tcsa = correlation_df(input_exp = tpm.tcsa)
# Function 2
correlation_heatmap = function(input.data, low=60, high=80){
# Heatmap
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(low),
colorRampPalette(colors = c("white","#D35400"))(high))
input.data %>% pheatmap::pheatmap(show_rownames = F, show_colnames = F,
border_color = NA,
color = my.color,
main = "Correlation Heatmap")
}
# Function 2 version 2
correlation_heatmap = function(input.data, low=60, high=80,
showname= F, fontsize=5){
# Heatmap
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(low),
colorRampPalette(colors = c("white","#D35400"))(high))
input.data %>% pheatmap::pheatmap(show_rownames = showname,
show_colnames = showname,
fontsize = fontsize,
border_color = NA,
color = my.color,
main = "Correlation Heatmap")
}
# Function 3
# Create heatmap data frame with corresponding row/column order in heatmap
correlation_heatmap_dataframe = function(input.data, low=60, high=80){
# Heatmap
my.color=c(colorRampPalette(colors = c("#2874A6","white"))(low),
colorRampPalette(colors = c("white","#D35400"))(high))
df = input.data %>% pheatmap::pheatmap(show_rownames = F, show_colnames = F,
border_color = NA,
color = my.color,
main = "Correlation Heatmap")
input.reordered = input.data[df$tree_row$order, df$tree_col$order]
return(input.reordered)
}Application
input.data = input.adc
# correlation_heatmap(input.data = input.data)
adc.hm.df = correlation_heatmap_dataframe(input.data = input.data)correlation_heatmap(input.data = input.data)
# To save heatmap as pdf format
dir <- "~/Desktop/figure/"
pdf(paste0(dir,"ADC_heatmap_with_names.pdf"), width = 10, height = 10)
correlation_heatmap(input.data = input.data, showname = T, fontsize = 5)
dev.off()
# To save heatmap data frame
input.data = input.adc
adc.hm.df = correlation_heatmap_dataframe(input.data = input.data)
adc.hm.df %>% write.csv(paste0(dir,"cor_spearman_adc_heatmap_order.csv"))Dotplot gallery
Simple code
# Modify the data
# df : a typical TPM data frame
df.melt = df %>% reshape::melt()
# Set colors
my_palette <- c(colorRampPalette(colors = c("#C8C7C7", "#FDABAB"))(80),
colorRampPalette(colors = c("#FDABAB", "#BB0202"))(180))
# Use the custom palette in ggplot2
df.melt %>% ggplot(aes(variable, gene, fill = value)) +
geom_point(shape = 21, size=6, color = "#C8C7C7") +
scale_fill_gradientn(colors = my_palette) +
theme_classic() +
theme(axis.text.x = element_text(angle = 9, hjust = 1))Version 1
dir= '~/Desktop/DF/DFCI_Barbie/DFCI_Barbie_NK_Marco/'
df.all = read.csv(paste0(dir,"data/additional_request/Fig4A.GSEA.csv"),
row.names = 1, stringsAsFactors = T)
# df.all = df.all %>% filter(sample != "H69M")
# Reorder x-axis and y-axis labels
df.all$sample = factor(df.all$sample, levels = c("CORL47","H82","H69M","H69EZ_G1","H69EZ_GV",
"H196"))
df.all$ID = factor(df.all$ID, levels= c("INTERFERON_GAMMA_RESPONSE",
"INFLAMMATORY_RESPONSE",
"CHEMOKINE_PRODUCTION",
"ALLOGRAFT_REJECTION",
"TGF_BETA_SIGNALING",
"NEUROENDOCRINE"))
# Draw plot
df.all %>% ggplot(aes(x = sample, y = forcats::fct_rev(ID), fill = NES, size = p.adjust)) +
geom_point(shape=21, color="darkgrey") +
theme_classic() +
scale_size_continuous(range = c(6,2), name = "FDR") + # 상단 범례 제목 설정
scale_fill_gradient2(low = "darkblue", mid = "white", high = "red", name = "NES") +
xlab("") +
ylab("") +
ggtitle("Cell Line / H69") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggeasy::easy_center_title()Two panel version
Two panels: A) H69M, H69EZ_G1, H69EZ_GV; B) CORL47, H82, H196
dir= '~/Desktop/DF/DFCI_Barbie/DFCI_Barbie_NK_Marco/'
df.all = read.csv(paste0(dir,"data/additional_request/Fig4A.GSEA.csv"),
row.names = 1, stringsAsFactors = T)
# df.all = df.all %>% filter(sample != "H69M")
# Reorder x-axis and y-axis labels
df.all$sample = factor(df.all$sample, levels = c("H69M","H69EZ_G1","H69EZ_GV",
"CORL47","H82","H196"))
df.all$ID = factor(df.all$ID, levels= c("INTERFERON_GAMMA_RESPONSE",
"INFLAMMATORY_RESPONSE",
"CHEMOKINE_PRODUCTION",
"ALLOGRAFT_REJECTION",
"TGF_BETA_SIGNALING",
"NEUROENDOCRINE"))
# Draw plot
p1=df.all %>% filter(sample %in% c("H69M","H69EZ_G1","H69EZ_GV")) %>%
ggplot(aes(x = sample, y = forcats::fct_rev(ID), fill = NES, size = p.adjust)) +
geom_point(shape=21, color="darkgrey") +
theme_classic() +
scale_size_continuous(range = c(6,2), name = "FDR", guide = guide_legend(order = 1)) +
scale_fill_gradient2(low = "darkblue", mid = "white", high = "red", name = "NES") +
xlab("") +
ylab("") +
ggtitle("Cell Line / H69") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggeasy::easy_center_title()
p2=df.all %>% filter(sample %in% c("CORL47","H82","H196")) %>%
ggplot(aes(x = sample, y = forcats::fct_rev(ID), fill = NES, size = p.adjust)) +
geom_point(shape=21, color="darkgrey") +
theme_classic() +
scale_size_continuous(range = c(6,2), name = "FDR") + # 상단 범례 제목 설정
scale_fill_gradient2(low = "darkblue", mid = "white", high = "red", name = "NES") +
xlab("") +
ylab("") +
ggtitle("Cell Line / H69") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
ggeasy::easy_center_title()
cowplot::plot_grid(p1,p2, ncol = 2)
More example analyses will be updated.